The objective of this project is to study the different Boroughs of NYC to find the optimal location to buy a property.
# Core data-handling libraries
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid", palette="pastel", color_codes=True)
# for geospatial data visualization
import folium
from folium.plugins import FastMarkerCluster
import datetime
# for web scraping and dealing with json data
import requests
import json
# pandas.io.json.json_normalize was deprecated in pandas 1.0 and removed in
# 2.0; the supported public entry point is pandas.json_normalize
from pandas import json_normalize
# to get longitudes and Latitudes of addresses
from geopy.geocoders import Nominatim
# import k-means from clustering stage
from sklearn.cluster import KMeans
print('Libraries imported.')
# NOTE(review): absolute Windows path — consider a relative path or a config
# value so the notebook is portable to other machines.
df_Permit = pd.read_csv("C:/Users/archd/OneDrive/Desktop/Projects/DOB_Permit_Issuance.csv")
print("Data imported.")
# Preview the first rows of the raw permits data
df_Permit.head()
Checking null values in the dataset:
df_Permit.isnull().sum()  # missing-value count per column
Checking number of observations and variables in the dataset.
df_Permit.shape  # (observations, variables)
The DOB Permit Issuance dataset has 3508249 observations and 60 variables. For this analysis we do not require all 60 variables and after studying all the variables, I have decided to keep only 14 variables which are described below.
# Keep only the 14 variables relevant to this analysis
cols = ["Bin #","BOROUGH","Job Type", "Residential","Permit Status","Filing Status","Oil Gas","Site Fill",
"Issuance Date", "Expiration Date", "Non-Profit","Owner's Business Type","LATITUDE","LONGITUDE"]
df_Permit = df_Permit[cols]
df_Permit.shape
df_Permit.isnull().sum()
# Inspect coordinate columns before deciding how to treat missing values
df_Permit[["LONGITUDE","LATITUDE"]].head()
df_Permit[["LONGITUDE","LATITUDE"]].isnull().sum()
We will have to drop the observations which do not have longitude and latitude values, as we cannot know the exact locations of buildings without them. We could fill in the missing longitudes and latitudes based on the addresses, but the result would have to be exact. Also, finding longitudes and latitudes for 12000 observations would require significant time as well as computation power.
# Drop rows without coordinates — exact building locations are required below
df_Permit = df_Permit.dropna(subset = ["LONGITUDE","LATITUDE"])
df_Permit[["LONGITUDE","LATITUDE"]].isnull().sum()
# Inspect the raw (string) date columns and where they are missing
df_Permit[["Issuance Date", "Expiration Date"]].head(10)
df_Permit[["Issuance Date", "Expiration Date"]].isnull().sum()
df_Permit[["Issuance Date", "Expiration Date"]][df_Permit["Issuance Date"].isnull()].head(10)
df_Permit[["Issuance Date", "Expiration Date"]][df_Permit["Expiration Date"].isnull()].head(10)
All of these dates are in string format. I will have to convert them into Datetime format.
# Convert the string date columns to datetime64 for comparison/arithmetic
df_Permit["Issuance Date"] = pd.to_datetime(df_Permit["Issuance Date"])
df_Permit["Issuance Date"].head()
df_Permit["Expiration Date"] = pd.to_datetime(df_Permit["Expiration Date"])
df_Permit["Expiration Date"].head()
I am interested into those buildings whose permits expire in the future. This will tell me which buildings are under construction right now. For this I will consider only those buildings whose permit expiration date is later than today's date.
# Today's date as an ISO string; pandas compares datetime columns to ISO strings
present = str(datetime.datetime.now().date())
# Keep permits still active today, plus rows with no expiration recorded yet
df_Permit = df_Permit[(df_Permit["Expiration Date"] > present) | (df_Permit["Expiration Date"].isnull())]
df_Permit["Permit Status"].value_counts()
df_Permit[["Permit Status","Expiration Date"]][df_Permit["Permit Status"] == "IN PROCESS"].head(20)
As we can see for buildings whose permit applications are in process, do not have permit expiration date. This is right and we will have to keep these observations in our analysis.
# Keep rows that have an expiration date, or are still IN PROCESS without one
df_Permit = df_Permit[(df_Permit["Expiration Date"].notnull()) | ((df_Permit["Permit Status"] == "IN PROCESS") & (df_Permit["Expiration Date"].isnull()))]
df_Permit.shape
df_Permit[["Permit Status","Expiration Date"]][df_Permit["Permit Status"].isnull()].head()
As we can see, for all null values of Permit Status an Expiration Date exists. This tells us that the permit was issued, so we can fill the null values of Permit Status with ISSUED. Also, I am renaming RE-ISSUED to ISSUED because I want to find out which buildings will be under construction in the future.
# Null Permit Status rows all have an expiration date, so they were issued
df_Permit["Permit Status"] = df_Permit["Permit Status"].fillna("ISSUED")
df_Permit["Permit Status"] = df_Permit["Permit Status"].replace({"RE-ISSUED":"ISSUED"})
df_Permit["Permit Status"].value_counts()
df_Permit["Permit Status"].isnull().sum()
# seaborn >= 0.13 rejects a Series passed positionally (it is taken as
# `data`); pass the column via the x= keyword instead
sns.countplot(x=df_Permit["Permit Status"])
Let's Visualize for which buildings Applications are under process as they will be under construction in near future.
temp_df = df_Permit[df_Permit["Permit Status"]=="IN PROCESS"]
temp_df.shape
boroughs = df_Permit["BOROUGH"].unique()
temp_df2 = df_Permit[["BOROUGH","LONGITUDE","LATITUDE"]]
# Mean coordinate per borough, used below as a map-label centroid.
# DataFrame.append was removed in pandas 2.0, so collect the rows first and
# construct the frame in a single call (also avoids quadratic re-copying).
_rows = []
for b in boroughs:
    in_borough = temp_df2["BOROUGH"] == b
    _rows.append({'BOROUGH': b,
                  'LONGITUDE': temp_df2.loc[in_borough, "LONGITUDE"].mean(),
                  'LATITUDE': temp_df2.loc[in_borough, "LATITUDE"].mean()})
borough_temp_df = pd.DataFrame(_rows, columns=["BOROUGH","LONGITUDE","LATITUDE"])
borough_temp_df
address = 'New York City, NY'
# geopy requires an explicit user_agent identifying the application; the
# zero-argument Nominatim() form is rejected by current geopy releases.
geolocator = Nominatim(user_agent="nyc_permit_analysis")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))
# Clustered map of all IN PROCESS permits, with a circle per borough centroid
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)
map_newyork.add_child(FastMarkerCluster(temp_df[["LATITUDE","LONGITUDE"]].values.tolist()))
# add markers to map
for lat, lng, borough in zip(borough_temp_df['LATITUDE'], borough_temp_df['LONGITUDE'], borough_temp_df['BOROUGH']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=50,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_newyork)
map_newyork
plt.figure(figsize = (8,5))
sns.countplot(x = df_Permit["BOROUGH"], hue = df_Permit["Permit Status"])
plt.show()
We can find the length of time for which the permit is issued from the Permit Issuance Date and Permit Expiration Date.
# Permit validity length in whole days. Series.astype('timedelta64[D]') was
# deprecated and then removed in pandas 2.x; the supported accessor is
# .dt.days (float64 here, because NaT differences yield NaN).
df_Permit["Permit Length"] = (df_Permit["Expiration Date"] - df_Permit["Issuance Date"]).dt.days
df_Permit["Permit Length"].head()
df_Permit["Permit Length"][df_Permit["Permit Length"].notnull()].describe()
# Mean length, used below to impute the missing values
m = df_Permit["Permit Length"][df_Permit["Permit Length"].notnull()].mean()
m
We have null values in the variable Permit Length as either Issuance or Expiration Date is null. I am filling this value with the mean value.
df_Permit["Permit Length"] = df_Permit["Permit Length"].fillna(m)  # impute missing lengths with the mean
Now we do not need Permit Issuance Date and Expiration Date. I am dropping them.
# Dates are no longer needed once Permit Length has been computed
df_Permit.drop(["Issuance Date", "Expiration Date"], axis =1, inplace = True)
df_Permit["BOROUGH"].value_counts()
plt.figure(figsize = (8,5))
# seaborn >= 0.13 rejects a Series passed positionally; use the x= keyword
sns.countplot(x=df_Permit["BOROUGH"], order = df_Permit['BOROUGH'].value_counts().index, palette=("Blues_d"))
plt.show()
As we can see, Manhattan is growing rapidly, so most of the DOB permits are issued for buildings in this borough.
# Distribution of job types within Manhattan only
temp_df = df_Permit[df_Permit["BOROUGH"]=="MANHATTAN"]
sns.countplot(x = temp_df["Job Type"],order = temp_df["Job Type"].value_counts().index)
plt.title("Type of Job in Manhattan")
In Manhattan most of the permits are issued for considerable alterations in buildings, followed by minor and major alterations. Very few permits are issued for new buildings, which tells us that this borough has been developed for a long time and buying a property here will be really costly. Since many alterations are going on, there will also be considerable noise pollution in this borough.
# Distribution of job types across all of NYC
df_Permit["Job Type"].value_counts()
sns.countplot(x = df_Permit["Job Type"],order = df_Permit["Job Type"].value_counts().index)
plt.title("Type of Job in NYC")
As we can see, most of the permits are issued for considerable alterations, followed by new buildings, major alterations, minor alterations, demolitions and sign changes.
# Job type broken down by borough
plt.figure(figsize = (12, 8))
sns.countplot(x = df_Permit["Job Type"], hue = df_Permit["BOROUGH"])
plt.show()
Above graph tells us that Queens and Brooklyn are developing boroughs as many new buildings are being built there. Let's see them in map.
# Show new-building ("NB") permits on a clustered map, with one labelled
# circle at each borough's mean coordinate for orientation.
temp_df = df_Permit[df_Permit["Job Type"]=="NB"]
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)
map_newyork.add_child(FastMarkerCluster(temp_df[["LATITUDE","LONGITUDE"]].values.tolist()))
# one circle marker per borough centroid
for centroid in borough_temp_df.itertuples(index=False):
    popup = folium.Popup('{}'.format(centroid.BOROUGH), parse_html=True)
    folium.CircleMarker(
        [centroid.LATITUDE, centroid.LONGITUDE],
        radius=50,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_newyork)
map_newyork
df_Permit["Residential"].value_counts()
We can fill null values of Residential as "NO" as they might not be used for residential purpose.
# Missing Residential flags are treated as non-residential ("NO")
df_Permit["Residential"] = df_Permit["Residential"].fillna("NO")
df_Permit["Residential"].value_counts()
sns.countplot(x = df_Permit["Residential"])
plt.figure(figsize = (8,5))
sns.countplot(x = df_Permit["BOROUGH"], hue = df_Permit["Residential"])
plt.show()
Brooklyn and Queens have most of the properties for residential purpose. Also, many new buildings are being built there. Those boroughs can be very good choices for buying houses/apartments.
temp_df = df_Permit[df_Permit["Residential"]=="YES"]
temp_df.shape
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)
# cap at 55000 points because of FastMarkerCluster performance limits
map_newyork.add_child(FastMarkerCluster(temp_df[["LATITUDE","LONGITUDE"]][1:55000].values.tolist()))
# add markers as boroughs to map
for lat, lng, borough in zip(borough_temp_df['LATITUDE'], borough_temp_df['LONGITUDE'], borough_temp_df['BOROUGH']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=50,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_newyork)
map_newyork #showing 55000 values because of the limitation of the folium library
# Encode Residential as a 0/1 indicator for clustering later
df_Permit["Residential"] = df_Permit["Residential"].replace({"NO":0, "YES":1})
df_Permit["Oil Gas"].value_counts()
df_Permit["Oil Gas"].isnull().sum()
df_Permit["Oil Gas"] = df_Permit["Oil Gas"].fillna("NONE")
# seaborn >= 0.13 rejects a Series passed positionally; use the x= keyword
sns.countplot(x=df_Permit["Oil Gas"])
sns.countplot(hue = df_Permit["Oil Gas"], x = df_Permit["Job Type"])
Most of the buildings do not require oil/gas, so this variable might not be helpful for my analysis. I am dropping it.
df_Permit.drop("Oil Gas", axis =1, inplace = True)
df_Permit["Owner\'s Business Type"].value_counts()
# seaborn >= 0.13 rejects a Series passed positionally; use the x= keyword
sns.countplot(x=df_Permit["Owner\'s Business Type"])
plt.xticks(rotation = 90)
For all business types that have only a few building permits, I will group them under the OTHER type for simplicity.
# Collapse rare government/agency owner types into OTHER for simplicity
df_Permit["Owner\'s Business Type"] = df_Permit["Owner\'s Business Type"].replace({"NYCHA":"OTHER", "HPD":"OTHER",
                                                                                   "DCAS":"OTHER","NY STATE":"OTHER",
                                                                                   "DOE":"OTHER","HHC":"OTHER",
                                                                                   "NYC AGENCY":"OTHER","NYCHA/HHC":"OTHER",
                                                                                   "OTHER GOV'T AGENCY": "OTHER" })
df_Permit["Owner\'s Business Type"] = df_Permit["Owner\'s Business Type"].fillna("OTHER")
df_Permit["Owner\'s Business Type"].value_counts()
# seaborn >= 0.13 rejects a Series passed positionally; use the x= keyword
sns.countplot(x=df_Permit["Owner\'s Business Type"], order = df_Permit["Owner\'s Business Type"].value_counts().index)
plt.xticks(rotation = 90)
Let's visualize which businesses are building new properties.
# New-building permits only
temp_df = df_Permit[df_Permit["Job Type"]=="NB"]
temp_df.shape
plt.figure(figsize=(8,5))
# seaborn >= 0.13 rejects a Series passed positionally; use the x= keyword
sns.countplot(x=temp_df["Owner\'s Business Type"], order = temp_df["Owner\'s Business Type"].value_counts().index)
plt.title("New Buildings under construction by Different types of businesses")
plt.show()
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)
# add markers to map (first ~1000 rows to keep the map light)
for lat, lng, businessType in zip(temp_df['LATITUDE'][1:1000], temp_df['LONGITUDE'][1:1000], temp_df['Owner\'s Business Type'][1:1000]):
    label = '{}'.format(businessType)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_newyork)
map_newyork
Most of the new buildings are being built by Corporation followed by Individual and Partnership.
df_Permit["Non-Profit"].value_counts()
df_Permit["Non-Profit"].isnull().sum()
# Treat missing Non-Profit flags as "N" (not a non-profit)
df_Permit["Non-Profit"] = df_Permit["Non-Profit"].fillna("N")
# seaborn >= 0.13 rejects a Series passed positionally; use the x= keyword
sns.countplot(x=df_Permit["Non-Profit"])
plt.figure(figsize = (8,5))
sns.countplot(x = df_Permit["BOROUGH"], hue = df_Permit["Non-Profit"])
plt.show()
df_Permit["Non-Profit"] = df_Permit["Non-Profit"].replace({"N":0, "Y":1})
temp_df = df_Permit[df_Permit["Non-Profit"]== 1]
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)
# add markers to map
# Bug fix: previously only BOROUGH was sliced to [1:1000], so zip paired
# coordinate row i with the borough of row i+1 (labels off by one, and the
# lat/lng series were not limited at all). Slice all three identically.
for lat, lng, borough in zip(temp_df['LATITUDE'][1:1000], temp_df['LONGITUDE'][1:1000], temp_df['BOROUGH'][1:1000]):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_newyork)
map_newyork
The above map shows the non-profit buildings for which permits are issued. Manhattan has maximum density of non-profit organizations.
df_Permit["Site Fill"].value_counts()
df_Permit["Site Fill"] = df_Permit["Site Fill"].fillna("NONE")
# Binarise: any site-fill activity -> 1, none/not applicable -> 0
df_Permit["Site Fill"] = df_Permit["Site Fill"].replace({"NOT APPLICABLE":0, "NONE":0, "ON-SITE":1,"OFF-SITE":1, "USE UNDER 300 CU.YD":1 })
df_Permit["Site Fill"].value_counts()
# seaborn >= 0.13 rejects a Series passed positionally; use the x= keyword
sns.countplot(x=df_Permit["Site Fill"])
plt.figure(figsize = (8,5))
sns.countplot(x = df_Permit["BOROUGH"], hue = df_Permit["Site Fill"])
plt.show()
The value of Site fill is True where mostly new buildings are being built.
df_Permit["Filing Status"].value_counts()
plt.figure(figsize = (8,5))
sns.countplot(x = df_Permit["BOROUGH"],hue = df_Permit["Filing Status"])
plt.show()
# Encode Filing Status ordinally: INITIAL -> 1, RENEWAL -> 2
df_Permit["Filing Status"] = df_Permit["Filing Status"].replace({"INITIAL":1, "RENEWAL":2})
df_Permit.isnull().sum()
I do not require Building Identification Number and I am dropping it.
# Building Identification Number is an identifier, not a feature — drop it
df_Permit.drop("Bin #", axis =1, inplace = True)
df_Permit.dtypes
We will have to remove Borough and Longitude, Latitude to make clusters.
# Drop the borough label and coordinates before clustering so the clusters
# are driven by permit characteristics, not location
df = df_Permit.drop(["BOROUGH","LATITUDE","LONGITUDE"], axis = 1)
# One-hot encode the remaining categorical columns
catcols = df.columns[df.dtypes == object]
df2 = pd.get_dummies(df, columns=catcols)
df2.head()
df2.shape
# set number of clusters
kclusters = 5
# run k-means clustering; n_init is pinned to the historical default (10)
# because scikit-learn changed the default to 'auto' in 1.4
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(df2)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]
df_Permit_merged = df_Permit.copy()
# add clustering labels
df_Permit_merged['Cluster Labels'] = kmeans.labels_
df_Permit_merged.head()
Let's visualize the clusters and Borough relationship.
# Cluster membership broken down by borough
plt.figure(figsize = (10,5))
sns.countplot(x=df_Permit_merged["BOROUGH"], hue = df_Permit_merged["Cluster Labels"])
plt.show()
As we can see that, K-means clustering does not yield any good results. All these buildings do not have similar characteristics. To find similarity we need more variables to analyze.
borough_temp_df
# NOTE(review): hard-coded API credentials committed to source and printed to
# output — rotate these keys and load them from environment variables or a
# config file instead of embedding them here.
CLIENT_ID = 'QUSLVNRGP23EPMS0B4EFN2TIMKOHSCGXXU1MOMBF04CXL4PH' # your Foursquare ID
CLIENT_SECRET = 'F05SSCAKGGAXINRW2JYPYN1FH155TIVYCQDSXHAAHVEY4HIQ' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
# function that extracts the category of the venue
def get_category_type(row):
try:
categories_list = row['categories']
except:
categories_list = row['venue.categories']
if len(categories_list) == 0:
return None
else:
return categories_list[0]['name']
def venuesFind(latitude, longitude, radius=3000):
    """Query Foursquare's 'explore' endpoint around (latitude, longitude).

    Returns a tuple (nearby_venues, map_borough): a DataFrame with venue
    name/category/coordinates and a folium map with one marker per venue.
    Bug fix: the `radius` argument was previously overwritten with 3000
    inside the function body, so callers could never change it; it is now
    honoured, with 3000 kept as the default for backward compatibility.
    """
    LIMIT = 100  # maximum venues per request allowed by the API
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        latitude,
        longitude,
        radius,
        LIMIT)
    results = requests.get(url).json()
    venues = results['response']['groups'][0]['items']
    nearby_venues = json_normalize(venues) # flatten JSON
    # filter columns
    filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
    nearby_venues = nearby_venues.loc[:, filtered_columns]
    # filter the category for each row
    nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
    # clean columns: keep the last dotted component, e.g. 'venue.name' -> 'name'
    nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
    map_borough = folium.Map(location=[latitude, longitude], zoom_start=13)
    for row in nearby_venues.itertuples():
        map_borough.add_child(folium.Marker(location=[row.lat, row.lng], popup=row.categories))
    return nearby_venues, map_borough
# Geocode the borough centre and fetch venues within 3 km.
# NOTE(review): this cell is repeated verbatim for all five boroughs below —
# a small helper taking the borough name would remove the duplication.
address = 'Bronx, NY'
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Bronx are {}, {}.'.format(latitude, longitude))
nearby_venues_Bronx , map_bronx = venuesFind(latitude, longitude, 3000)
nearby_venues_Bronx.head()
nearby_venues_Bronx["categories"].value_counts().head(10)
map_bronx
# Geocode Manhattan's centre and fetch venues within 3 km
address = 'Manhattan, NY'
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))
nearby_venues_Manhattan , map_Manhattan = venuesFind(latitude, longitude, 3000)
nearby_venues_Manhattan.head()
nearby_venues_Manhattan["categories"].value_counts().head(10)
map_Manhattan
# Geocode Brooklyn's centre and fetch venues within 3 km
address = 'Brooklyn, NY'
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Brooklyn are {}, {}.'.format(latitude, longitude))
nearby_venues_Brooklyn , map_Brooklyn = venuesFind(latitude, longitude, 3000)
nearby_venues_Brooklyn.head()
nearby_venues_Brooklyn["categories"].value_counts().head(10)
map_Brooklyn
# Geocode Queens' centre and fetch venues within 3 km
address = 'Queens, NY'
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Queens are {}, {}.'.format(latitude, longitude))
nearby_venues_Queens , map_Queens = venuesFind(latitude, longitude, 3000)
nearby_venues_Queens.head()
nearby_venues_Queens["categories"].value_counts().head(10)
map_Queens
# Geocode Staten Island's centre and fetch venues within 3 km
address = 'Staten Island, NY'
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Staten Island are {}, {}.'.format(latitude, longitude))
nearby_venues_SI , map_SI = venuesFind(latitude, longitude, 3000)
nearby_venues_SI.head()
nearby_venues_SI["categories"].value_counts().head(10)
map_SI